
* Session set-up: turn off --more-- paging so the do-file runs without pausing,
* and empty the data in memory before anything is loaded
set more off
clear

/*********************************************************************************
Name: intermediate_datasets.do

Data In: [Data/Original/hw_roster.dta,
		  Data/Original/observation_days_centers.dta,
		  Data/Original/center_gps_coordinates.dta]

Data Out: [Data/Intermediate/hw_roster_by_center.dta,
		   Data/Intermediate/hw_roster_by_hw.dta,
		   Data/Intermediate/hw_roster_by_month.dta,
		   Data/Intermediate/center_monthly_gps_coordinates.dta]

Results Out: []

Purpose of do-file: Creating intermediate datasets used for the analysis

Organization: PART-1: Creating multiple reshapings of the health worker roster
			  PART-2: Creating a monthly-level dataset with GPS coordinates of TB treatment centers
*********************************************************************************/

* Setting path directory: every file path below is relative to the folder
* stored in the global macro DIRECTORY, which must be defined before this do-file runs
cd "$DIRECTORY"


****************************************
*** PART-1 *** Creating multiple reshapings of the health worker roster
****************************************

** Ordered by center
* Output: one roster record per UID_Center, plus some_attrition = the largest
* value of attritor among the center's roster records (i.e. 1 if any health
* worker attrited, assuming attritor is a 0/1 flag -- TODO confirm).

use "Data/Original/hw_roster.dta", clear

* Remember the original file order so that the record kept for each center is
* always the same one: a plain -sort- does not guarantee the order of tied
* observations, which made the kept record differ between runs.
tempvar file_order
gen long `file_order' = _n

bysort UID_Center: egen some_attrition = max(attritor)

* Keeping the first record (in file order) of each center
bysort UID_Center (`file_order'): keep if _n == 1

* The temporary variable must be dropped explicitly, otherwise it would be saved
drop `file_order'

* Data are left sorted by UID_Center
save "Data/Intermediate/hw_roster_by_center.dta", replace


** Ordered by health worker
* Output: one roster record per Unique_ID (health worker), sorted by Unique_ID.

use "Data/Original/hw_roster.dta", clear

* Remember the original file order so that, if a health worker appears more
* than once, the record kept is always the first one in the file: a plain
* -sort- does not guarantee the order of tied observations.
tempvar file_order
gen long `file_order' = _n

* Keeping the first record (in file order) of each health worker
bysort Unique_ID (`file_order'): keep if _n == 1

* The temporary variable must be dropped explicitly, otherwise it would be saved
drop `file_order'

* Data are left sorted by Unique_ID
save "Data/Intermediate/hw_roster_by_hw.dta", replace


** Containing one observation per month

* -clear- added for consistency with the other -use- commands in this file:
* without it the load aborts whenever unsaved data are in memory.
use "Data/Original/hw_roster.dta", clear

* Month index of each health worker's first and last month at the center.
* The +17 offset gives index 17 to the month in which the experiment starts
* (joindate_m == expstartdate_m), leaving indices 1-16 for the months before it.
gen MonthIntoExp_start = joindate_m - expstartdate_m + 17 if !missing(joindate_m)
gen MonthIntoExp_termin = termindate_m - expstartdate_m + 17 if !missing(termindate_m)

* Defaults when a date is missing: the worker is treated as present from the
* experiment start month (17) and/or until the last month index used below (35)
replace MonthIntoExp_start = 17 if missing(joindate_m)
replace MonthIntoExp_termin = 35 if missing(termindate_m)

* Checking overlaps in MonthIntoExp and breaking ties
* Within a center, workers are ordered by their start month; counselor_lag is
* the gap between a worker's start month and the previous worker's last month.
* A value of 0 means both workers are assigned the same month index. It is
* missing for the first worker of each center (no previous record).

sort UID_Center MonthIntoExp_start MonthIntoExp_termin
by UID_Center: gen counselor_lag =  MonthIntoExp_start[_n] - MonthIntoExp_termin[_n - 1]
tab counselor_lag
// Few cases where months overlap

* last_month_days: termindate_d for attritors -- presumably the day of the month
* on which the worker left, i.e. days worked in the last month (TODO confirm).
* NOTE(review): -if attritor- is also true when attritor is missing.
gen last_month_days = termindate_d if attritor
* first_month_days: days from joindate (inclusive) to the end of the joining month,
* computed as (first day of next month) - joindate. December is handled
* separately because month 13 is not a valid argument for mdy().
gen first_month_days = mdy(month(joindate)+1,01,year(joindate)) - joindate
replace first_month_days = mdy(12,31,year(joindate)) - joindate + 1 if month(joindate)==12

* "Giving" month to the health worker with larger number of days in that month
* First rule: the incoming worker starts one month later when they have fewer
* days in the shared month than the outgoing worker. Second rule: the outgoing
* worker ends one month earlier when the incoming worker has more days.
* Both inequalities are strict, so an exact tie in days changes nothing.
* NOTE(review): Stata treats missing as larger than any number, so a missing
* last_month_days[_n - 1] makes the first condition true and a missing
* first_month_days[_n+1] makes the second condition true -- confirm this is intended.
* The [_n - 1] / [_n + 1] references are not under -by-, hence the explicit
* UID_Center equality checks.
replace MonthIntoExp_start = MonthIntoExp_start + 1 if first_month_days[_n] < last_month_days[_n - 1] & counselor_lag == 0 & UID_Center == UID_Center[_n - 1]
replace MonthIntoExp_termin = MonthIntoExp_termin - 1 if first_month_days[_n+1] > last_month_days[_n] & counselor_lag[_n+1] == 0 & UID_Center == UID_Center[_n + 1]

* Recomputing the gap after the adjustment, for visual inspection only
by UID_Center: gen counselor_lag_2 =  MonthIntoExp_start[_n] - MonthIntoExp_termin[_n - 1]
tab counselor_lag_2 
// No overlap anymore
drop counselor_lag counselor_lag_2 

* Creating MonthIntoExp variable for each center and health worker
* For every month index i = 1..35, Unique_ID`i' holds the worker's ID when i
* falls within [MonthIntoExp_start, MonthIntoExp_termin], and missing otherwise.
forvalues i = 1/35 {
	* Stored as double: the default float type cannot hold every integer above
	* 16,777,216 exactly, and -replace- does not widen a float, so large IDs
	* would have been silently rounded
	gen double Unique_ID`i' = Unique_ID if inrange(`i',MonthIntoExp_start,MonthIntoExp_termin)

	* Checking that number of health workers per month in each center is  = 0 or 1
	* (the do-file stops here if two workers still share a month)
	bys UID_Center: egen count_month_`i' = count(Unique_ID`i')

	ta count_month_`i'
	assert count_month_`i' <= 1

	drop count_month_`i'
}

* Reshaping to get one record per center-month
* Each Unique_ID`i' has at most one non-missing value per center (asserted above),
* so its mean is simply that worker's ID; the result is one row per center,
* which is then turned into one row per center and month index.
collapse (mean) Unique_ID? Unique_ID??, by(UID_Center)
reshape long Unique_ID, i(UID_Center) j(MonthIntoExp)

* Filling months with no health worker assigned.
* Months before the experiment start (index 16 and below): walking backwards in
* time, each empty month takes the worker of the month that follows it, so the
* worker present at index 17 is carried back over all earlier empty months.
gsort UID_Center -MonthIntoExp
by UID_Center: replace Unique_ID = Unique_ID[_n-1] if MonthIntoExp < 17 & missing(Unique_ID)

* Months after the experiment start (index 18 and above): walking forwards in
* time, each empty month takes the worker of the month before it, i.e. the
* previous worker is kept until the next one starts.
* Index 17 itself is never filled by either step.
sort UID_Center MonthIntoExp
by UID_Center: replace Unique_ID = Unique_ID[_n-1] if MonthIntoExp >= 18 & missing(Unique_ID)

* Undoing the offset: index 17 becomes month 1 of the experiment
replace MonthIntoExp = MonthIntoExp - 16

save "Data/Intermediate/hw_roster_by_month.dta", replace


****************************************
*** PART-2 *** Creating a monthly-level dataset with GPS coordinates of TB treatment centers
****************************************

** Using GPS data collected during observation days

use "Data/Original/observation_days_centers.dta", clear

keep Unique_ID UID_Center unique_mon_instance evening_form_yn visit_date starttime_hh starttime_mm start_gps_lat start_gps_long

* Dropping records that are identical on every kept variable
duplicates drop *, force

* Numbering each center's visits chronologically (date, then start hour and minute).
* The numbering is done with -by- directly on the full sort order instead of
* -bysort UID_Center:-, so it never depends on a re-sort by center alone; -stable-
* keeps visits with identical date and start time in their pre-sort order
* instead of an arbitrary one. The hard-coded visit numbers used further below
* depend on this numbering.
sort UID_Center visit_date starttime_hh starttime_mm, stable
by UID_Center: gen visit_number = _n

** Using center-level GPS coordinates dataset
* The merge result is not checked: records found in only one of the two files
* are kept, and the merge indicator is discarded.

merge m:1 UID_Center using "Data/Original/center_gps_coordinates.dta", gen(_mergeGPS)
drop _merge*

** Creating dataset with one record per month per center

* Creating new variables to record month of shifting center
* shift_center     : 1 for visits flagged by the visit-number rule below, 0 for the
*                    other visits of a listed center, missing for all other centers
* shift_date       : mean visit_date over the flagged visits of the center
* shift_month      : month of shift_date, moved to the next month when the day is after the 15th
* shift_month_days : day of the month of shift_date

gen shift_center = .
gen shift_date = .
gen shift_month = .
gen shift_month_days = .

* Centers that changed location, and, in the same order, the visit_number at
* which the change is observed. Note that despite its name, shift_months holds
* visit numbers (it is compared with visit_number), not calendar months.
local shifted_centers `" "Sudama Nagar/telibandha" "Shakarpur" "Chakyasahni" "Annanagar" "Aisbag" "Gudi Guda ka Naka" "Malwa mill/bhagirath pura" "Musakhedi" "'
local n_centers : word count `shifted_centers'

local shift_months "32 44 12 35 13 32 13 15"
assert `n_centers'==`:word count `shift_months''

forvalues i=1/`n_centers' {

	local center `: word `i' of `shifted_centers''
	local shiftmonth `:word `i' of `shift_months''

	* For Aisbag and Malwa mill/bhagirath pura the visits BEFORE the threshold are
	* flagged; for every other center the visits from the threshold onwards are.
	* (Evaluated once per center instead of inside the observation-level condition;
	* this also removes a dangling /// that relied on the blank line after it.)
	local reversed = inlist("`center'","Aisbag","Malwa mill/bhagirath pura")
	if `reversed' {
		replace shift_center = 1 if visit_number < `shiftmonth' & center == "`center'"
	}
	else {
		replace shift_center = 1 if visit_number >= `shiftmonth' & center == "`center'"
	}

	replace shift_center = 0 if shift_center == . & center == "`center'"

	su visit_date if shift_center == 1 & center == "`center'"

	* Without any flagged visit r(mean) is not defined and the commands below
	* would stop with an uninformative syntax error; stop with a clear message instead
	if r(N) == 0 {
		display as error "no flagged observation days found for center: `center'"
		exit 459
	}

	* NOTE(review): the shift date is the MEAN date of all flagged visits, not the
	* first (or, for the two reversed centers, the last) flagged visit, and it can
	* be a non-integer date -- confirm that this is the intended definition.
	replace shift_date = `r(mean)' if center == "`center'"
	replace shift_month = mofd(`r(mean)') if center == "`center'"
	replace shift_month_days = day(`r(mean)') if center == "`center'"
	replace shift_month = shift_month + 1 if shift_month_days > 15 & center == "`center'"

}

format shift_date %td

* Creating variable for GPS coordinates after shifting (taking average of observation days data when center_gps_coordinates.dta is incorrect)

foreach axis in lat long {

	* Center-level average of the coordinate recorded on the flagged visits
	* (shift_center == 1); missing on all other records
	bys UID_Center: egen temp_`axis' = mean(start_gps_`axis') if shift_center == 1

	* Shakarpur: the average is recomputed leaving out visit number 58
	* (presumably an erroneous GPS reading -- TODO confirm)
	qui summ start_gps_`axis' if center == "Shakarpur" & shift_center == 1 & visit_number != 58
	replace  temp_`axis' = `r(mean)' if center == "Shakarpur" & shift_center == 1
}

* Spreading the average to every record of the center (the temp_ variables are
* constant within center where defined, so max() simply copies the value)
foreach axis in long lat {
	bys UID_Center: egen gps_`axis'_shift = max(temp_`axis')
}

* Keeping unique records per center
* Only the first record of each center is retained, and then only the center
* identifiers, experiment dates, shift information and coordinates.

bysort UID_Center: keep if _n == 1
keep UID_Center Unique_ID center expstartdate expenddate expstartdate_m expenddate_m shift_date shift_month shift_month_days gps_lat_shift gps_long_shift centergpslat centergpslong

* Creating MonthIntoExp variables

* The shift cannot happen after the end of the experiment
assert shift_month <= expenddate_m if !missing(shift_month)

* Month index of the shift, on the same scale as the roster (experiment start month = 17)
gen MonthIntoExp_shift = shift_month - expstartdate_m + 17 if !missing(shift_month)

* Centers for which the observation-day coordinates apply BEFORE the shift month
* rather than from the shift month onwards
local reversed `"inlist(center,"Aisbag","Malwa mill/bhagirath pura")"'

forvalues m = 1/35 {

	* True when month `m' falls on the side of the shift for which the
	* observation-day coordinates are used
	local at_shifted_site `"!missing(MonthIntoExp_shift) & ((`m' >= MonthIntoExp_shift & !`reversed') | (`m' < MonthIntoExp_shift & `reversed'))"'

	foreach axis in lat long {
		* Observation-day coordinate where it applies ...
		gen gps_`axis'_final`m' = gps_`axis'_shift if `at_shifted_site'
		* ... and the coordinate from center_gps_coordinates.dta everywhere else
		* (including when the observation-day coordinate is missing)
		replace gps_`axis'_final`m' = centergps`axis' if missing(gps_`axis'_final`m')
	}

}

* Reshaping file to get one record per center-month
* Only the center identifiers and the 35 monthly coordinate pairs are needed from here on

keep UID_Center center gps_lat_final* gps_long_final*
reshape long gps_lat_final gps_long_final, i(UID_Center) j(MonthIntoExp)

* Undoing the offset: index 17 becomes month 1 of the experiment
replace MonthIntoExp = MonthIntoExp - 16

* The month-specific coordinates take over the names of the original center coordinates
rename gps_long_final centergpslong
rename gps_lat_final centergpslat

* Merging to create one record per center per month with GPS and health worker information
* Every center-month must be present in both files, otherwise the do-file stops

merge 1:1 UID_Center MonthIntoExp using "Data/Intermediate/hw_roster_by_month.dta"
assert _merge == 3
drop _merge

order UID_Center center MonthIntoExp Unique_ID centergpslat centergpslong

save "Data/Intermediate/center_monthly_gps_coordinates.dta", replace
